#set.seed(42)
#setwd("~/Desktop/r-583")

## Load crayon up front so red()/blue()/green() are available for the colored
## status messages used throughout this script (it was originally attached
## only much later, long after its first use).
library(crayon)

cat( red("*** The Dataset *** \n \n") )
## *** The Dataset ***
##

## Read the Wisconsin breast-cancer data.  `header = TRUE` is spelled out:
## the original `head = T` relied on partial argument matching and on the
## reassignable alias `T`.
Data <- read.table("breast-cancer-wisconsin.data", header = TRUE)
Data <- na.omit(Data)       # drop rows with missing values
Data <- unique(Data)        # drop duplicate rows
names(Data)[11] <- "Class"  # column 11 is the benign(2)/malignant(4) label

## Coerce the nine feature columns X1..X9 to numeric in one loop instead of
## nine copy-pasted statements.
for (col in paste0("X", 1:9)) {
  Data[[col]] <- as.numeric(Data[[col]])
}
Data
cat(red("*** The sample means, sample covariance matrix, and sample correlation matrix 9 variables *** \n\n"))
## *** The sample means, sample covariance matrix, and sample correlation matrix 9 variables ***
#install.packages("tidyverse")
cat(red("The means of columns \n \n"))
## The means of columns
##
# Sample mean of each of the nine cytology features.  `Data[2:10]` selects
# columns 2-10 of the data frame (equivalent to `Data[, 2:10]` here).
round(colMeans(Data[2:10]), digits = 4)
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 4.4269 3.1302 3.2012 2.8249 3.2113 3.1650 3.4356 2.8828 1.5933
cat(red("The covariance matrix \n \n"))
## The covariance matrix
##
# 9x9 sample covariance matrix of the features.
round(cov(Data[,2:10]), digits=5)
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## X1 7.92907 5.50953 5.44878 3.93863 3.20532 2.16135 3.85869 4.62548 1.69995
## X2 5.50953 9.24968 8.15057 6.21704 4.99853 2.84080 5.64173 6.78196 2.41101
## X3 5.44878 8.15057 8.76093 5.86281 4.65164 2.76966 5.33834 6.57724 2.24714
## X4 3.93863 6.21704 5.86281 8.21712 3.83705 1.86661 4.68943 5.29539 2.06057
## X5 3.20532 4.99853 4.65164 3.83705 4.83935 1.72741 3.33392 4.27698 1.82952
## X6 2.16135 2.84080 2.76966 1.86661 1.72741 4.60753 1.99035 2.69908 0.86864
## X7 3.85869 5.64173 5.33834 4.68943 3.33392 1.99035 5.96505 5.01056 1.44696
## X8 4.62548 6.78196 6.57724 5.29539 4.27698 2.69908 5.01056 9.40218 2.25951
## X9 1.69995 2.41101 2.24714 2.06057 1.82952 0.86864 1.44696 2.25951 2.96917
cat(red("The correlation matrix \n \n"))
## The correlation matrix
##
# 9x9 sample correlation matrix; note X2/X3 are very highly correlated
# (0.905 in the recorded output below).
round(cor(Data[,2:10]), digits=5)
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## X1 1.00000 0.64334 0.65375 0.48795 0.51745 0.35759 0.56108 0.53571 0.35035
## X2 0.64334 1.00000 0.90542 0.71312 0.74711 0.43515 0.75953 0.72724 0.46006
## X3 0.65375 0.90542 1.00000 0.69099 0.71439 0.43593 0.73845 0.72469 0.44059
## X4 0.48795 0.71312 0.69099 1.00000 0.60848 0.30336 0.66981 0.60245 0.41717
## X5 0.51745 0.74711 0.71439 0.60848 1.00000 0.36582 0.62052 0.63406 0.48264
## X6 0.35759 0.43515 0.43593 0.30336 0.36582 1.00000 0.37965 0.41008 0.23485
## X7 0.56108 0.75953 0.73845 0.66981 0.62052 0.37965 1.00000 0.66906 0.34382
## X8 0.53571 0.72724 0.72469 0.60245 0.63406 0.41008 0.66906 1.00000 0.42764
## X9 0.35035 0.46006 0.44059 0.41717 0.48264 0.23485 0.34382 0.42764 1.00000
cat(red("the summary \n \n"))
## the summary
##
# Five-number summary per feature; all features live on a 1-10 scale.
summary(Data[,2:10], digits=1)
## X1 X2 X3 X4 X5 X6
## Min. : 1 Min. : 1 Min. : 1 Min. : 1 Min. : 1 Min. : 1
## 1st Qu.: 2 1st Qu.: 1 1st Qu.: 1 1st Qu.: 1 1st Qu.: 2 1st Qu.: 2
## Median : 4 Median : 1 Median : 1 Median : 1 Median : 2 Median : 2
## Mean : 4 Mean : 3 Mean : 3 Mean : 3 Mean : 3 Mean : 3
## 3rd Qu.: 6 3rd Qu.: 5 3rd Qu.: 5 3rd Qu.: 4 3rd Qu.: 4 3rd Qu.: 3
## Max. :10 Max. :10 Max. :10 Max. :10 Max. :10 Max. :11
## X7 X8 X9
## Min. : 1 Min. : 1 Min. : 1
## 1st Qu.: 2 1st Qu.: 1 1st Qu.: 1
## Median : 3 Median : 1 Median : 1
## Mean : 3 Mean : 3 Mean : 2
## 3rd Qu.: 5 3rd Qu.: 4 3rd Qu.: 1
## Max. :10 Max. :10 Max. :10
cat (red("*** Using different visualization plots to get a deeper understanding of the relationshipS between the different featurers (X1 To X9) *** " ))
## *** Using different visualization plots to get a deeper understanding of the relationshipS between the different featurers (X1 To X9) ***
#install.packages("pairsD3")
## Use library() rather than require(): require() only returns FALSE when the
## package is missing, letting the script continue to a confusing error later.
library(pairsD3)
## Loading required package: pairsD3
## Interactive scatterplot matrix of the nine features.  The original code
## subscripted the same nine columns twice (`Data[, 2:10][, c(1, ..., 9)]`);
## a single selection is equivalent.
pairsD3(Data[, 2:10], opacity = 0.9, cex = 2, width = 1000)
#install.packages("psych")
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:crayon':
##
## %+%
## Static scatterplot matrix with histograms, densities and correlation
## ellipses per panel.
pairs.panels(Data[,2:10],
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = TRUE, # show density plots
ellipses = TRUE # show correlation ellipses
)
## NOTE(review): par(mfrow=...) does not affect lattice graphics such as
## xyplot(); each xyplot below still renders on its own page — confirm the
## intended layout.
par(mfrow=c(3,3))
library(lattice)
xyplot(Data$X1 ~ Data$X2, groups = Data$Class)
xyplot(Data$X1 ~ Data$X3, groups = Data$Class)
xyplot(Data$X1 ~ Data$X4, groups = Data$Class)
xyplot(Data$X1 ~ Data$X5, groups = Data$Class)
xyplot(Data$X1 ~ Data$X6, groups = Data$Class)
xyplot(Data$X1 ~ Data$X7, groups = Data$Class)
xyplot(Data$X1 ~ Data$X8, groups = Data$Class)
xyplot(Data$X1 ~ Data$X9, groups = Data$Class)
xyplot(Data$X2 ~ Data$X3, groups = Data$Class)
library(MASS)
## 2-D kernel density estimate of (X1, X2), drawn as a perspective surface.
den1 <- kde2d(Data$X1, Data$X2,n=60)
persp(den1$x,den1$y,den1$z, xlab="X1", ylab="X2", zlab="Density", theta= 30, col=rainbow(15))
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## The following object is masked from 'package:crayon':
##
## %+%
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(Data[, 2:4])
## BUG FIX: `mydata2` was used throughout the rest of this script but never
## created; define it as a working copy of the cleaned data before attaching.
mydata2 <- Data
## NOTE(review): attach() is discouraged (it silently shadows names); it is
## kept here because later sections refer to bare column names (X5, X7, ...).
attach(mydata2)
library(car)
scatterplotMatrix( ~X1+X2+X3+X4+X5+X6+X7+X8+X9|Class , data=mydata2, diagonal=list(method="boxplot" ) ,regLine = list(method=lm, lty=1,lwd=2,col="red"), smooth=list(smoother=loessLine, spread=FALSE,lty.smooth=2,lwd.smooth=1.5, lty.spread=3, lwd.spread=.5),main="Scatterplot Matrix")
# Groupby function for dataframe in R
## Add a human-readable label column.  The counts just below must test the
## new `Type` column: the original tested `Class` (still numeric 2/4 at this
## point) against the strings, so both counts printed 0.
mydata2$Type[mydata2$Class==2] <- "benign"
mydata2$Type[mydata2$Class==4] <- "malignant"
length(which(mydata2$Type=="malignant" ))
## (previously printed 0 because the wrong column was tested)
length(which(mydata2$Type=="benign" ))
## (previously printed 0 because the wrong column was tested)
head(mydata2)
cat(red("Here we notice that if Clump Thickness is high , the possibility to have malignant cancer is higher "))
## Here we notice that if Clump Thickness is high , the possibility to have malignant cancer is higher
## group_by(), summarise_at() and vars() are dplyr verbs; dplyr was never
## loaded anywhere in this script, so attach it before the first use.
library(dplyr)
## Trimmed mean (20%) and median of X1 per Class group.
summarise_at(group_by(mydata2,Class),vars(X1),list(~ mean(., trim = .2), ~ median(., na.rm = TRUE)))
# Groupby function for dataframe in R
cat(red("Here we notice that if Uniformity of Cell Size is high , the possibility to have malignant cancer is higher"))
## Here we notice that if Uniformity of Cell Size is high , the possibility to have malignant cancer is higher
summarise_at(group_by(mydata2,Class),vars(X2),list(~ mean(., trim = .2), ~ median(., na.rm = TRUE)))
cat(red("
Here we notice that if Marginal Adhesion is high , the possibility to have malignant cancer is higher"))
##
## Here we notice that if Marginal Adhesion is high , the possibility to have malignant cancer is higher
# Groupby function for dataframe in R
## Recode Class itself to the string labels; the identical recodes repeated
## below are no-ops after this first one (kept to preserve the original flow).
mydata2$Class[mydata2$Class==2] <- "benign"
mydata2$Class[mydata2$Class==4] <- "malignant"
summarise_at(group_by(mydata2,Class),vars(X4),list(~ mean(., trim = .2), ~ median(., na.rm = TRUE)))
cat(red("Here we notice that if Bland Chromatin is high , the possibility to have malignant cancer is higher"))
## Here we notice that if Bland Chromatin is high , the possibility to have malignant cancer is higher
# Groupby function for dataframe in R
mydata2$Class[mydata2$Class==2] <- "benign"
mydata2$Class[mydata2$Class==4] <- "malignant"
summarise_at(group_by(mydata2,Class),vars(X7),list(~ mean(., trim = .2), ~ median(., na.rm = TRUE)))
cat(red("Here we notice that if Normal Nucleoli is high , the possibility to have malignant cancer is higher"))
## Here we notice that if Normal Nucleoli is high , the possibility to have malignant cancer is higher
# Groupby function for dataframe in R
mydata2$Class[mydata2$Class==2] <- "benign"
mydata2$Class[mydata2$Class==4] <- "malignant"
summarise_at(group_by(mydata2,Class),vars(X8),list(~ mean(., trim = .2), ~ median(., na.rm = TRUE)))
library(lattice)
# Plot symbols are taken from the Class column (one pch value per point),
# with red/blue superposed colors distinguishing the two groups.
myPars <- list(superpose.symbol = list(pch =mydata2$Class, col = c("red", "blue")))
# Scatterplot matrix of the nine features, colored by tumour class.
splom(mydata2[,2:10], groups = mydata2$Class, subset = TRUE, panel = panel.superpose,
auto.key = list(title="By cancer type", columns=2),
par.settings = myPars)
#faces(mydata2[,2:10])
cat(red("*** Normality Tests for Statistical Analysis *** \n \n
** we need do the univarite normality tests for all three margins: \n"))
## *** Normality Tests for Statistical Analysis ***
##
##
## ** we need do the univarite normality tests for all three margins:
par(mfrow=c(3,3))
## One normal QQ-plot per feature; a single loop replaces the nine
## copy-pasted qqnorm()/qqline() pairs of the original.
for (v in paste0("X", 1:9)) {
  qqpoints <- qqnorm(Data[[v]], lwd = 2, main = paste0("QQ-plot for ", v))
  qqline(Data[[v]], col = "blue", cex = 2, lwd = 2)
}
#install.packages("nortest")
library(nortest)
cat(red("Based on Anderson-Darling test, all marginal variables are NOT normally distributed (SMALL p-values). \n \n"))
## Based on Anderson-Darling test, all marginal variables are NOT normally distributed (SMALL p-values).
##
## Anderson-Darling test for each feature column (2..10); a loop replaces the
## nine copy-pasted calls.  Inside a loop, autoprinting does not happen, so
## print() is explicit.  The recorded output was 3.7e-24 for every column —
## apparently the smallest value ad.test reports; confirm against nortest docs.
for (j in 2:10) {
  print(ad.test(mydata2[, j])$p.value)
}
cat(red("** check whether or not they are following multivariate normal."))
## ** check whether or not they are following multivariate normal.
cat(red('1. The chi-square QQ-plot \n'))
## 1. The chi-square QQ-plot
source("qqchi2.R")
qqchi2(mydata2[,2:10])
## correlation coefficient: 0.9518349
cat(blue("NOTE:"))
## NOTE:
## BUG FIX (message): the original text said the points are "not deviated
## from the straight line", contradicting its own conclusion; deviation from
## the line is precisely the evidence against multivariate normality.
cat(("
correlation coefficient: 0.9515688 Based on the qq plot, we can see that the values of d2 j are not close to the Chi-square distribution with 9 degree of freedom (the values deviate from the straight line), this might be a good indication that nine-variate data has not a multivariate normal distribution. Now, let’s try some multivarite normal tests:"))
##
## correlation coefficient: 0.9515688 Based on the qq plot, we can see that the values of d2 j are not close to the Chi-square distribution with 9 degree of freedom (the values deviate from the straight line), this might be a good indication that nine-variate data has not a multivariate normal distribution. Now, let’s try some multivarite normal tests:
cat(red("2.MLEs for Multivariate Normal distribution \n \n"))
## 2.MLEs for Multivariate Normal distribution
##
source("testnormality.R")
cat(red("p.value is :\n"))
## p.value is :
testnormality(mydata2[,2:10])
## [1] 3.904808e-32
#install.packages("energy" )
cat(red("3.Energy test of multivariate normality \n \n"))
## 3.Energy test of multivariate normality
##
library(energy)
mvnorm.etest(mydata2[,2:10], R=999)
##
## Energy test of multivariate normality: estimated parameters
##
## data: x, sample size 691, dimension 9, replicates 999
## E-statistic = 88.798, p-value < 2.2e-16
cat(red("4.Test for multivariate skewness and kurtosis \n \n"))
## 4.Test for multivariate skewness and kurtosis
##
## mult.norm() is provided by QuantPsyc, which the original script only
## attached further below, after this first use; load it before calling it.
library(QuantPsyc)
mn <- mult.norm(mydata2[,2:10], chicrit=0.001) # Mardia skewness/kurtosis on the 9 features
mn$mult.test
## Beta-hat kappa p-val
## Skewness 40.77652 4696.09605 0
## Kurtosis 194.98910 89.65992 0
cat(red("5.Test for Mardia's multivariate tests \n \n"))
## 5.Test for Mardia's multivariate tests
##
library(QuantPsyc)
mult.norm(mydata2[,2:10],chicrit=0.001)$mult.test
## Beta-hat kappa p-val
## Skewness 40.77652 4696.09605 0
## Kurtosis 194.98910 89.65992 0
source("qqbeta.R")
cat (red("6.QQ-plot of Beta for stiffness data"))
## 6.QQ-plot of Beta for stiffness data
qqbeta(mydata2[,2:10])
## correlation coefficient: 0.9508116
cat (red("7.Hotellings T^2 Test"))
## 7.Hotellings T^2 Test
## HotellingsT2Test() is provided by DescTools, which the original script
## only attached much later (for the two-sample test); load it here, before
## this first call.
library(DescTools)
HotellingsT2Test(mydata2[,2:10],test="chi")
##
## Hotelling's one sample T2-test
##
## data: mydata2[, 2:10]
## T.2 = 3385.8, df = 9, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to c(0,0,0,0,0,0,0,0,0)
cat (blue("NOTE: \n "))
## NOTE:
##
## BUG FIX (message): the original claimed a small p-value "would not reject"
## a null that the data are "not" normal — a double negation that inverted
## the conclusion.  Small p-values REJECT the null of multivariate normality.
cat("All of the multivariate tests above have small p-values, so we reject the null hypothesis that the data follow a multivariate normal distribution.")
## All of the multivariate tests above have small p-values, so we reject the null hypothesis that the data follow a multivariate normal distribution.
## bv.boxplot() comes from asbio; the original script attached asbio only at
## the end of this section, after several calls.  Load it first.
#install.packages("asbio")
library(asbio)
par(mfrow=c(1,2))
## NOTE(review): `utilities` is referenced in the `names` arguments below but
## is never defined anywhere in this script — presumably a data set from an
## earlier exercise.  Confirm it is available, or drop the `names` argument.
bv.boxplot(Data$X1,Data$X8,ID.out = TRUE, robust = TRUE, D = 7, pch = 21, bg.out='blue', bg = "red", hinge.col = 1, fence.col = 1,hinge.lty = 2, fence.lty = 3, names = unlist(strsplit( paste(utilities[,1]), split=", ")), cex.ID.out = 0.7, uni.CI = FALSE, uni.conf = 0.95,uni.CI.col = 1, uni.CI.lty = 1, uni.CI.lwd = 2, show.points = TRUE,ylab="X1", xlab="X8")
bv.boxplot(Data$X1,Data$X2,ID.out = TRUE, robust = TRUE, D = 7, pch = 21, bg.out='blue', bg = "red", hinge.col = 1, fence.col = 1,hinge.lty = 2, fence.lty = 3, cex.ID.out = 0.7, uni.CI = FALSE, uni.conf = 0.95,uni.CI.col = 1, uni.CI.lty = 1, uni.CI.lwd = 2, show.points = TRUE,ylab="X1", xlab="X2")
bv.boxplot(Data$X3, Data$X4, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X3", ylab="X4", pch=19)
par(mfrow=c(1,3))
## Bare X5/X7/X8/X9 below rely on the earlier attach(mydata2).
bv.boxplot(X5, X7, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X5", ylab="X7", pch=19)
bv.boxplot(X5, X8, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X5", ylab="X8", pch=19)
bv.boxplot(X5, X9, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X5", ylab="X9", pch=19)
par(mfrow=c(1,3))
bv.boxplot(X7, X8, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X7", ylab="X8", pch=19)
bv.boxplot(X7, X9, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X7", ylab="X9", pch=19)
bv.boxplot(X8, X9, main="Scatterplot 1", ID.out = TRUE,bg.out ="green", xlab="X8", ylab="X9", pch=19)
## Remove the bivariate outliers flagged above and re-draw.
## NOTE(review): row 509 appears twice in this index vector (harmless for
## negative indexing, but probably unintended).
data2= Data[-c(186,347,509,321,582,509,355,684,85,505,344,185,503,500,342,184,497,340,183), ]
par(mfrow=c(1,2))
bv.boxplot(data2$X1,data2$X8, ID.out = TRUE, robust = TRUE, D = 7, pch = 21, bg.out='blue', bg = "red", hinge.col = 1, fence.col = 1,hinge.lty = 2, fence.lty = 3, names = unlist(strsplit( paste(utilities[,1]), split=", ")), cex.ID.out = 0.7, uni.CI = FALSE, uni.conf = 0.95,uni.CI.col = 1, uni.CI.lty = 1, uni.CI.lwd = 2, show.points = TRUE,ylab="X1", xlab="X8")
bv.boxplot(data2$X1,data2$X2, ID.out = TRUE,robust = TRUE, D = 7, pch = 21, bg.out='blue', bg = "red", hinge.col = 1, fence.col = 1,hinge.lty = 2, fence.lty = 3, names = unlist(strsplit( paste(utilities[,1]), split=", ")), cex.ID.out = 0.7, uni.CI = FALSE, uni.conf = 0.95,uni.CI.col = 1, uni.CI.lty = 1, uni.CI.lwd = 2, show.points = TRUE,ylab="X1", xlab="X2")
cat (red("Confidence Regions: \n \n"))
## Confidence Regions:
##
cat(red("1.Simultaneous Confidence Intervals based on T^2 \n"))
## 1.Simultaneous Confidence Intervals based on T^2
#install.packages("mvdalab")
library(mvdalab)
##
## Attaching package: 'mvdalab'
## The following object is masked from 'package:psych':
##
## smc
MVcis(Data[,2:10],include.zero=TRUE, level=0.95)
## [,1] [,2]
## X1 3.982113 4.871722
## X2 2.649826 3.610666
## X3 2.733603 3.668713
## X4 2.372080 3.277703
## X5 2.863791 3.558785
## X6 2.825906 3.504050
## X7 3.049798 3.821403
## X8 2.398414 3.367143
## X9 1.321151 1.865535
cat(red("2.Calculate Bonferroni Confidence Intervals\n \n"))
## 2.Calculate Bonferroni Confidence Intervals
##
alpha <- 0.05
nc <- ncol(Data[, 2:10])        # number of variables (9)
nr <- nrow(Data[, 2:10])        # sample size
xbar2 <- colMeans(Data[, 2:10]) # sample mean vector
xvar <- var(Data[, 2:10])       # sample covariance matrix
se <- sqrt(diag(xvar)) / sqrt(nr)  # standard error of each mean
## Bonferroni: split alpha evenly over the nc simultaneous intervals.
cr <- qt(1 - alpha / (2 * nc), nr - 1)
## Vectorized in place of the original element-by-element loop; matrix()
## (rather than cbind() of named vectors) keeps the unnamed [i,]/[,j]
## printing of the original.
bonfcr <- matrix(c(xbar2 - cr * se, xbar2 + cr * se), ncol = 2)
bonfcr
## [,1] [,2]
## [1,] 4.128943 4.724892
## [2,] 2.808413 3.452079
## [3,] 2.887943 3.514373
## [4,] 2.521553 3.128230
## [5,] 2.978500 3.444076
## [6,] 2.937834 3.392122
## [7,] 3.177152 3.694049
## [8,] 2.558303 3.207254
## [9,] 1.411002 1.775684
cat(red("3. one-at-a-time confidence intervals \n \n"))
## 3. one-at-a-time confidence intervals
##
## Ordinary 95% t-intervals, one variable at a time (no multiplicity
## adjustment); vectorized in place of the original loop.
cr <- qt(1 - 0.05 / 2, nr - 1)
indvcr <- matrix(c(xbar2 - cr * se, xbar2 + cr * se), ncol = 2)
indvcr
## [,1] [,2]
## [1,] 4.216596 4.637239
## [2,] 2.903084 3.357408
## [3,] 2.980079 3.422236
## [4,] 2.610784 3.038999
## [5,] 3.046978 3.375598
## [6,] 3.004652 3.325305
## [7,] 3.253178 3.618023
## [8,] 2.653752 3.111805
## [9,] 1.464640 1.722046
cat (blue("NOTE\n"))
## NOTE
cat(("we will use chi square distribution instead to F-distribution because it is not normal \n"))
## we will use chi square distribution instead to F-distribution because it is not normal
cat(red("4.Calculate Confidence Intervals based on chi-distrbution insted to f-distrbution \n \n"))
## 4.Calculate Confidence Intervals based on chi-distrbution insted to f-distrbution
##
## Large-sample simultaneous intervals using the chi-square critical value
## with nc degrees of freedom; vectorized in place of the original loop.
cr <- sqrt(qchisq((1 - alpha), nc))
chi_ <- matrix(c(xbar2 - cr * se, xbar2 + cr * se), ncol = 2)
chi_
## [,1] [,2]
## [1,] 3.986303 4.867532
## [2,] 2.654351 3.606141
## [3,] 2.738006 3.664309
## [4,] 2.376345 3.273438
## [5,] 2.867064 3.555512
## [6,] 2.829100 3.500857
## [7,] 3.053432 3.817769
## [8,] 2.402976 3.362581
## [9,] 1.323714 1.862971
## Re-derive the building blocks for the comparative interval table below.
X <- Data[,2:10]
n <- nrow(X)        # sample size
p <- ncol(X)        # number of variables (9)
xbar <- colMeans(X) # sample mean vector
S <- cov(X)         # sample covariance matrix
## Simultaneous T^2 confidence interval for one linear combination a'mu of a
## multivariate mean, using Hotelling's T^2 critical value.
##
## mu    : sample mean vector (length p)
## Sigma : p x p sample covariance matrix
## n     : sample size
## avec  : coefficient vector a (default: sum of all components)
## level : simultaneous confidence level, strictly between 0 and 1
## Returns c(lower, upper) for a'mu.
T.ci <- function(mu, Sigma, n, avec=rep(1,length(mu)), level=0.95){
  p <- length(mu)
  ## Validate conformability before any arithmetic.
  if (nrow(Sigma) != p) stop("Need length(mu) == nrow(Sigma).")
  if (ncol(Sigma) != p) stop("Need length(mu) == ncol(Sigma).")
  if (length(avec) != p) stop("Need length(mu) == length(avec).")
  if (level <= 0 | level >= 1) stop("Need 0 < level < 1.")
  ## T^2 critical value: p(n-1)/(n-p) * F_{p, n-p}(level).
  crit <- p * (n - 1) * qf(level, p, n - p) / (n - p)
  ## Point estimate of a'mu and estimated variance of a'xbar.
  est <- crossprod(avec, mu)
  varhat <- crossprod(avec, Sigma %*% avec) / n
  margin <- sqrt(crit * varhat)
  c(lower = est - margin, upper = est + margin)
}
TCI <- tCI <- bon <- NULL
## BUG FIX: the Bonferroni level must split 0.05 over the p = 9 simultaneous
## intervals, i.e. 1 - 0.05/(2*p).  The original divided by 2*n (the sample
## size), which is why the recorded `bon` row was almost as wide as the T^2
## row and disagreed with `bonfcr` computed earlier from 1 - alpha/(2*nc).
alpha <- 1 - (0.05/(2*p))
for(k in seq_len(p)){
  ## Unit coefficient vector selecting variable k.
  avec <- rep(0, p)
  avec[k] <- 1
  ## Simultaneous T^2 interval via the helper above.
  TCI <- c(TCI, T.ci(xbar, S, n, avec))
  ## One-at-a-time 95% t-interval.
  tCI <- c(tCI,
           xbar[k] - sqrt(S[k,k]/n)*qt(0.975, df=n-1),
           xbar[k] + sqrt(S[k,k]/n)*qt(0.975, df=n-1))
  ## Bonferroni-adjusted t-interval.
  bon <- c(bon,
           xbar[k] - sqrt(S[k,k]/n)*qt(alpha, df=n-1),
           xbar[k] + sqrt(S[k,k]/n)*qt(alpha, df=n-1))
}
chi <- NULL
## Large-sample chi-square-based simultaneous intervals.
for(k in seq_len(p)){
  chi <- c(chi,
           xbar[k] - sqrt(S[k,k]/n)*sqrt(qchisq(0.95, df=p)),
           xbar[k] + sqrt(S[k,k]/n)*sqrt(qchisq(0.95, df=p)))
}
cat(red("*** using different code *** \n \n"))
## *** using different code ***
##
# Stack the four interval types row-wise for a side-by-side width comparison
# (T^2 simultaneous, Bonferroni, one-at-a-time t, chi-square based).
rtab <- rbind(TCI, bon,tCI,chi)
rtab
## lower upper lower upper lower upper lower upper
## TCI 3.982113 4.871722 2.649826 3.610666 2.733603 3.668713 2.372080 3.277703
## bon 3.999228 4.854607 2.668311 3.592181 2.751593 3.650723 2.389503 3.260280
## tCI 4.216596 4.637239 2.903084 3.357408 2.980079 3.422236 2.610784 3.038999
## chi 3.986303 4.867532 2.654351 3.606141 2.738006 3.664309 2.376345 3.273438
## lower upper lower upper lower upper lower upper
## TCI 2.863791 3.558785 2.825906 3.504050 3.049798 3.821403 2.398414 3.367143
## bon 2.877162 3.545414 2.838953 3.491003 3.064643 3.806558 2.417051 3.348506
## tCI 3.046978 3.375598 3.004652 3.325305 3.253178 3.618023 2.653752 3.111805
## chi 2.867064 3.555512 2.829100 3.500857 3.053432 3.817769 2.402976 3.362581
## lower upper
## TCI 1.321151 1.865535
## bon 1.331624 1.855062
## tCI 1.464640 1.722046
## chi 1.323714 1.862971
cat(red("*** Multivariate of Two-Sample Tests *** \n"))
## *** Multivariate of Two-Sample Tests ***
cat(red("*** Confidence Interval for Two Independent Samples (different sample sizes) *** \n"))
## *** Confidence Interval for Two Independent Samples (different sample sizes) ***
cat(red("*** test H0 : µ1 = µ2 versus Ha : µ1 ≠µ2. *** \n"))
## *** test H0 : µ1 = µ2 versus Ha : µ1 ≠µ2. ***
cat(blue("NOTE \n"))
## NOTE
cat(("order the data based on Class column\n\n"))
## order the data based on Class column
## Split the feature columns by tumour class.  The original relied on
## attach() for the bare `Class` name and on hard-coded row ranges
## (1:458, 459:691); selecting rows by the Class value is equivalent here
## and robust to changes in the data.
data1 <- Data[order(Data$Class), ]
benign <- data1[data1$Class == 2, 2:10]
malignant <- data1[data1$Class == 4, 2:10]
## Hotelling T^2 test.  Only the independent two-sample branch is implemented
## in this script; the one-sample and paired branches are deliberately elided
## ("same code as before"), so calling without Y, or with paired = TRUE,
## falls through and returns NULL invisibly — NOTE(review): confirm the full
## versions exist elsewhere if those branches are ever exercised.
##
## X, Y       : data matrices (rows = observations, columns = variables)
## mu         : hypothesized mean difference vector (default 0)
## asymp      : TRUE -> chi-square large-sample null; FALSE -> exact F scaling
## var.equal  : TRUE -> pooled covariance; FALSE -> Welch-style unequal Sigma
## Returns a one-row data.frame with T2, Fstat, df1, df2, p.value.
T.test <- function(X, Y=NULL, mu=0, paired=FALSE, asymp=FALSE, var.equal=TRUE){
if(is.null(Y)){
# one-sample T^2 test: same code as before (omitted here)
} else {
if(paired){
# dependent two-sample T^2 test: same code as before (omitted here)
} else {
# independent two-sample T^2 test
X <- as.matrix(X)
Y <- as.matrix(Y)
nx <- nrow(X)
ny <- nrow(Y)
p <- ncol(X)
# Both samples must have the same variables, and each group must have more
# observations than variables so the covariance matrices are invertible.
if(p != ncol(Y)) stop("Need ncol(X) == ncol(Y).")
if(min(nx,ny) <= p) stop("Need min(nrow(X),nrow(Y)) > ncol(X).")
# Difference of the two sample mean vectors.
dbar <- colMeans(X) - colMeans(Y)
if(var.equal){
# Pooled-covariance version: T2 with exact F reference on (p, nx+ny-p-1) df.
df2 <- nx + ny - p - 1
Sp <- ((nx-1)*cov(X) + (ny-1)*cov(Y)) / (nx + ny - 2)
T2 <- (1/((1/nx) + (1/ny))) * t(dbar - mu) %*% solve(Sp) %*% (dbar - mu)
Fstat <- T2 / ((nx + ny - 2) * p / df2)
} else {
# Unequal-covariance version: Sp is the covariance of the mean difference,
# and nu is an approximate degrees-of-freedom correction — appears to be the
# Nel-van der Merwe / Krishnamoorthy-Yu style approximation; confirm against
# a reference before relying on the small-sample F calibration.
Sx <- cov(X)
Sy <- cov(Y)
Sp <- (Sx/nx) + (Sy/ny)
T2 <- t(dbar - mu) %*% solve(Sp) %*% (dbar - mu)
SpInv <- solve(Sp)
SxSpInv <- (1/nx) * Sx %*% SpInv
SySpInv <- (1/ny) * Sy %*% SpInv
nudx <- (sum(diag(SxSpInv %*% SxSpInv)) + (sum(diag(SxSpInv)))^2) / nx
nudy <- (sum(diag(SySpInv %*% SySpInv)) + (sum(diag(SySpInv)))^2) / ny
nu <- (p + p^2) / (nudx + nudy)
df2 <- nu - p + 1
Fstat <- T2 / (nu * p / df2)
}
# asymp = TRUE uses the chi-square(p) large-sample null distribution of T2;
# otherwise the finite-sample F scaling computed above.
if(asymp){
pval <- 1 - pchisq(T2, df=p)
} else {
pval <- 1 - pf(Fstat, df1=p, df2=df2)
}
return(data.frame(T2=as.numeric(T2), Fstat=as.numeric(Fstat),
df1=p, df2=df2, p.value=as.numeric(pval),
type="ind-sample", asymp=asymp, var.equal=var.equal, row.names=""))
} # end if(paired)
} # end if(is.null(Y))
} # end T.test function
cat(blue("NOTE \n"))
## NOTE
cat("asymptotically works for non-normal multivariate\n\n")
## asymptotically works for non-normal multivariate
cat(red("Multivariate Independent Samples T^2 Test\n\n"))
## Multivariate Independent Samples T^2 Test
# Pooled-covariance two-sample T^2 with the chi-square (large-sample) null,
# since the normality tests above rejected multivariate normality.
T.test(benign, malignant,asymp=TRUE)
cat(red("Multivariate Independent Samples T^2 Test Inferences when Σ1 ≠Σ2.\n\n"))
## Multivariate Independent Samples T^2 Test Inferences when Σ1 ≠Σ2.
# Same test without assuming equal covariance matrices (Welch-style branch).
T.test(benign, malignant, var.equal=FALSE,asymp=TRUE)
cat(red("** using different code \n"))
## ** using different code
# Cross-check with the packaged implementation.
library(DescTools)
HotellingsT2Test( benign,malignant,test="chi")
##
## Hotelling's two sample T2-test
##
## data: benign and malignant
## T.2 = 2451.8, df = 9, p-value < 2.2e-16
## alternative hypothesis: true location difference is not equal to c(0,0,0,0,0,0,0,0,0)
cat(red("*** Multivariate of Two-Sample Tests *** \n"))
## *** Multivariate of Two-Sample Tests ***
cat(red("*** Confidence Interval for Two Independent Samples (different sample sizes) *** \n"))
## *** Confidence Interval for Two Independent Samples (different sample sizes) ***
cat(red("*** test H0 : Σ1 = Σ2 versus Ha : Σ1 ≠Σ2. *** \n"))
## *** test H0 : Σ1 = Σ2 versus Ha : Σ1 ≠Σ2. ***
# Box's M test for equality of the two covariance matrices, applied to the
# log-transformed features.
# NOTE(review): this BoxM() signature and the "MVTests" class in the recorded
# output match the MVTests package, which is never loaded in this script —
# confirm library(MVTests) is attached before running.
BoxM(data= log(Data[,2:10]),group=Data[,11])
## $Chisq
## [1] Inf
##
## $df
## [1] 45
##
## $p.value
## [1] 0
##
## $Test
## [1] "BoxM"
##
## attr(,"class")
## [1] "MVTests" "list"
#install.packages("rpanel", dependencies = TRUE)
#install.packages("Rcpp", dependencies = TRUE)
#install.packages("classInt", dependencies = TRUE)
#install.packages("SpatialEpi", dependencies = TRUE)
#install.packages("biotools", dependencies = TRUE)
#library(biotools)
#boxM(mydata2[,2:10],mydata2[,11])
cat(blue("NOTE \n"))
## NOTE
cat("The Box-M statistic is inf with p-value zero. Thus, the null hypothesis is rejected at the 5% level.")
## The Box-M statistic is inf with p-value zero. Thus, the null hypothesis is rejected at the 5% level.
X <- as.matrix(benign)
Y <- as.matrix(malignant)
nx <- nrow(X)  # benign sample size
ny <- nrow(Y)  # malignant sample size
nc <- ncol(malignant)  # number of variables (9)
nr <- nrow(malignant)
## Difference of group means (malignant - benign).
xbar2 <- colMeans(malignant) - colMeans(benign)
## Pooled covariance matrix.
xvar <- ((nx-1)*cov(X) + (ny-1)*cov(Y)) / (nx + ny - 2)
## BUG FIX: the standard error of a difference of two independent means is
## sqrt(Sp_ii * (1/nx + 1/ny)).  The original used sqrt(Sp_ii)/sqrt(nr),
## i.e. only the malignant group's sample size, which understates the width
## of every interval computed from `se` below.
se <- sqrt(diag(xvar) * (1/nx + 1/ny))
cat(red("*** Two sample confidence interval calculator ***"))
## *** Two sample confidence interval calculator ***
cat(red("1.Calculate Bonferroni Confidence Intervals\n \n"))
## 1.Calculate Bonferroni Confidence Intervals
##
alpha <- 0.05
## Bonferroni split of alpha over the nc simultaneous intervals.  For a
## pooled-variance two-sample interval the t degrees of freedom are
## nx + ny - 2 (the original used nr - 1, the malignant sample size only).
## Vectorized in place of the original element-by-element loop.
cr <- qt(1 - alpha / (2 * nc), nx + ny - 2)
bonfcr <- matrix(c(xbar2 - cr * se, xbar2 + cr * se), ncol = 2)
bonfcr
## [,1] [,2]
## [1,] 3.819122 4.553744
## [2,] 4.854425 5.514430
## [3,] 4.723836 5.366293
## [4,] 3.780015 4.544572
## [5,] 2.820219 3.418943
## [6,] 1.922102 2.604876
## [7,] 3.572261 4.165730
## [8,] 4.231179 5.019529
## [9,] 1.259662 1.832285
cat(red("2.Calculate Confidence Intervals based on chi-distrbution \n \n"))
## 2.Calculate Confidence Intervals based on chi-distrbution
##
## Large-sample simultaneous intervals for the mean difference, using the
## chi-square critical value with nc degrees of freedom; vectorized in place
## of the original loop.
cr <- sqrt(qchisq((1 - alpha), nc))
chi_ <- matrix(c(xbar2 - cr * se, xbar2 + cr * se), ncol = 2)
chi_
## [,1] [,2]
## [1,] 3.646673 4.726193
## [2,] 4.699493 5.669363
## [3,] 4.573023 5.517106
## [4,] 3.600539 4.724048
## [5,] 2.679672 3.559491
## [6,] 1.761825 2.765154
## [7,] 3.432947 4.305044
## [8,] 4.046117 5.204590
## [9,] 1.125241 1.966706
cat(red("3. one-at-a-time confidence intervals \n \n"))
## 3. one-at-a-time confidence intervals
##
## Unadjusted 95% t-intervals for each mean difference.  Pooled two-sample
## intervals use nx + ny - 2 degrees of freedom (the original used nr - 1).
## Vectorized in place of the original loop.
cr <- qt(1 - 0.05 / 2, nx + ny - 2)
indvcr <- matrix(c(xbar2 - cr * se, xbar2 + cr * se), ncol = 2)
indvcr
## [,1] [,2]
## [1,] 3.927890 4.444976
## [2,] 4.952145 5.416710
## [3,] 4.818958 5.271171
## [4,] 3.893215 4.431372
## [5,] 2.908866 3.330296
## [6,] 2.023194 2.503785
## [7,] 3.660130 4.077861
## [8,] 4.347901 4.902806
## [9,] 1.344444 1.747503
cat(red("*** One Way MANOVA *** \n\n"))
## *** One Way MANOVA ***
cat(blue("Research Question: \n" ))
## Research Question:
cat("Do dependent variable 1 until dependent variable 9 differ by independent variable (group 1 vs. group 2)? \n \n" )
## Do dependent variable 1 until dependent variable 9 differ by independent variable (group 1 vs. group 2)?
##
cat(red("Ho: Dependent 1 until dependent variable 9 do not differ by (independent variable (group 1 vs. group 2). \n"))
## Ho: Dependent 1 until dependent variable 9 do not differ by (independent variable (group 1 vs. group 2).
cat (green("Ha: Dependent 1 until dependent variable 9 differ by independent variable (group 1 vs. group 2). \n \n \n"))
## Ha: Dependent 1 until dependent variable 9 differ by independent variable (group 1 vs. group 2).
##
##
library(crayon)
library(car)
# One-way MANOVA of the features on Class.
# NOTE(review): X7 is missing from the cbind() response list (only eight
# responses; num Df 8 in the recorded output) — confirm whether its
# exclusion is intentional.
# NOTE(review): Class is used here as a raw 2/4 variable rather than a
# factor — with only two distinct values the 1-df result matches the
# recorded output, but a factor would make the intent explicit.
fit.manova <- manova(cbind(X1,X2,X3,X4,X5,X6,X8,X9) ~ Class, data = Data)
summary(fit.manova)
## Df Pillai approx F num Df den Df Pr(>F)
## Class 1 0.7937 327.98 8 682 < 2.2e-16 ***
## Residuals 689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red("*** One Way ANOVA *** \n\n"))
## *** One Way ANOVA ***
cat(blue("Research Question: \n" ))
## Research Question:
cat("Is there a statistically significant difference on a dependent variable by independent variable? \n \n")
## Is there a statistically significant difference on a dependent variable by independent variable?
##
cat(red("Ho: There is not a statistically significant difference on a dependent variable by independent variable (group 1 vs. group 2)? \n" ))
## Ho: There is not a statistically significant difference on a dependent variable by independent variable (group 1 vs. group 2)?
cat(green("Ha: There is a statistically significant difference on a dependent variable by independent variable (group 1 vs. group 2)? \n \n\n" ))
## Ha: There is a statistically significant difference on a dependent variable by independent variable (group 1 vs. group 2)?
##
#install.packages("crayon")
library(crayon)
library(car)
# Same multivariate test as above, refit via lm() so that car::Anova can
# produce Type II MANOVA tests (equivalent here since Class is the only term).
# NOTE(review): X7 is again omitted from cbind() -- confirm intentional.
# NOTE(review): `fit1` is reassigned to the multiple-regression fit later in
# the file, so this Anova object does not survive past that point.
mod <- lm( cbind(X1,X2,X3,X4,X5,X6,X8,X9) ~ Class,data=Data)
fit1 <- Anova(mod, type=2)
fit1
##
## Type II MANOVA Tests: Pillai test statistic
## Df test stat approx F num Df den Df Pr(>F)
## Class 1 0.7937 327.98 8 682 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Univariate follow-up: a one-way ANOVA for each feature (X1..X9) against the
# two-level grouping variable.
# NOTE(review): `Factor1` is not created in this chunk -- presumably a factor
# version of Data$Class defined earlier in the file; confirm.
# NOTE(review): `fit.anova1` is reused for every feature, so only the last
# fit (X9) survives after this section runs.
cat(red(" Study the effect the feature X1 on independent variable Class \n\n " ))
## Study the effect the feature X1 on independent variable Class
##
##
fit.anova1 <- aov( X1~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 2812 2812.1 728.7 <2e-16 ***
## Residuals 689 2659 3.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X2 on independent variable Class \n\n" ))
## Study the effect the feature X2 on independent variable Class
fit.anova1 <- aov( X2~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 4268 4268 1390 <2e-16 ***
## Residuals 689 2115 3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X3 on independent variable Class \n\n" ))
## Study the effect the feature X3 on independent variable Class
fit.anova1 <- aov( X3~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 4042 4042 1390 <2e-16 ***
## Residuals 689 2003 3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X4 on independent variable Class \n\n" ))
## Study the effect the feature X4 on independent variable Class
fit.anova1 <- aov( X4~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 2789 2789.1 667.1 <2e-16 ***
## Residuals 689 2881 4.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X5 on independent variable Class \n\n" ))
## Study the effect the feature X5 on independent variable Class
fit.anova1 <- aov( X5~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 1550 1549.6 596.6 <2e-16 ***
## Residuals 689 1790 2.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X6 on independent variable Class \n\n" ))
## Study the effect the feature X6 on independent variable Class
fit.anova1 <- aov( X6~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 820.2 820.2 239.6 <2e-16 ***
## Residuals 689 2359.0 3.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X7 on independent variable Class \n\n" ))
## Study the effect the feature X7 on independent variable Class
fit.anova1 <- aov( X7~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 2356 2356.2 922.6 <2e-16 ***
## Residuals 689 1760 2.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X8 on independent variable Class \n\n" ))
## Study the effect the feature X8 on independent variable Class
fit.anova1 <- aov( X8~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 3322 3322 722.9 <2e-16 ***
## Residuals 689 3166 5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(red(" Study the effect the feature X9 on independent variable Class \n\n" ))
## Study the effect the feature X9 on independent variable Class
fit.anova1 <- aov( X9~ Factor1, data=Data)
summary(fit.anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Factor1 1 368.5 368.5 151.1 <2e-16 ***
## Residuals 689 1680.2 2.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat(blue("NOTE \n\n"))
## NOTE
cat("The results show that the significant contributions to all features (X1 TO X9) are the Class. \n")
## The results show that the significant contributions to all features (X1 TO X9) are the Class.
cat(red("The Bonferroni simultaneous confidence intervals for each feature effects are obtained in below \n\n"))
## The Bonferroni simultaneous confidence intervals for each feature effects are obtained in below
# Residual sums-of-squares-and-products matrix from the MANOVA fit above.
fit.manova.summary=summary(fit.manova)
SSP.res=fit.manova.summary$SS$Residuals
SSP.res
## X1 X2 X3 X4 X5 X6 X8
## X1 2658.93068 337.32152 388.27922 -82.93348 124.14770 -27.39895 135.3194
## X2 337.32152 2114.66764 1470.70017 839.71770 877.36930 89.23094 914.5480
## X3 388.27922 1470.70017 2003.19057 687.79361 706.96116 90.30638 874.2315
## X4 -82.93348 839.71770 687.79361 2880.71219 568.60860 -224.53663 610.0951
## X5 124.14770 877.36930 706.96116 568.60860 1789.52529 64.51683 682.3657
## X6 -27.39895 89.23094 90.30638 -224.53663 64.51683 2358.98040 211.7858
## X8 135.31942 914.54798 874.23149 610.09515 682.36574 211.78578 3165.9178
## X9 154.98227 409.54960 330.09685 407.98972 506.69553 49.58338 452.7041
## X9
## X1 154.98227
## X2 409.54960
## X3 330.09685
## X4 407.98972
## X5 506.69553
## X6 49.58338
## X8 452.70414
## X9 1680.22332
cat(red(" you could obtain the Bonferroni simultaneous confidence intervals for each feature Effect as follows
\n"))
## you could obtain the Bonferroni simultaneous confidence intervals for each feature Effect as follows
# Least-squares means per group for each response, via lsmeans (the lsmeans /
# emmeans package is assumed to be loaded earlier in the file -- verify).
# NOTE(review): this selection includes X7 but drops X6, whereas the MANOVA
# above dropped X7 -- confirm which feature set is intended.
X= as.matrix(Data [, c("X1","X2","X3","X4","X5","X7","X8","X9")])
mod= lm(X ~ Factor1 )
p <- ncol(X)
lsm.F1<- vector("list", p)
names(lsm.F1) <- colnames(X)
# NOTE(review): the loop index j is never used inside either loop, so every
# element of lsm.F1 / lsm.FA holds the same lsmeans object; this looks like a
# leftover from a multi-factor template -- confirm.
for(j in 1:p){
wts <- rep(0, p*1)
wts[1:1 ] <- 1
lsm.F1[[j]] <- lsmeans(mod, "Factor1", weights= wts )
}
lsm.FA <- vector("list", p)
names(lsm.FA) <- colnames(X)
for(j in 1:p){
wts <- rep(1, p*1)
wts[1:1 ] <- 0
lsm.FA[[j]] <- lsmeans(mod, "Factor1", weights= wts )
}
# Bonferroni correction: q = number of simultaneous comparisons
# (p responses x 1 pairwise group contrast), two-sided.
q <- p * 2 * (2-1)/2
alp <- 0.05 / (2*q)
# Bonferroni pairwise CIs for Factor 1
CI2= rbind( confint(contrast(lsm.F1[[1]], "pairwise"), level=1-alp, adj="none") ,
confint(contrast(lsm.FA[[1]], "pairwise"), level=1-alp, adj="none") )
NN=c("X1", "X2", "X3","X4","X5","X7","X8","X9" )
# Round/format the estimate and CI bound columns for display.
CI2[, c(3,5,6)]=format(round( ( CI2[, c(3,5,6)]),5), nsmall = 5)
CI2= data.frame( NN, CI2)
CI2
cat(red("*** Multiple Linear Regression *** \n"))
## *** Multiple Linear Regression ***
# Regress the (numeric) Class label on all nine features.
# NOTE(review): Class is a categorical label treated here as a continuous
# response; a logistic model would usually be preferred -- confirm intent.
# This assignment also overwrites the `fit1` Anova object created earlier.
fit1<- lm(Class~X1+X2+X3+X4+X5+X6+X7+X8+X9,data = Data)
summary(fit1)
##
## Call:
## lm(formula = Class ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 +
## X9, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90532 -0.21030 -0.02616 0.15826 1.50185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.340191 0.039232 34.161 < 2e-16 ***
## X1 0.082907 0.007781 10.655 < 2e-16 ***
## X2 0.034956 0.014171 2.467 0.013884 *
## X3 0.055610 0.013661 4.071 5.24e-05 ***
## X4 0.048292 0.008587 5.624 2.72e-08 ***
## X5 0.020728 0.011663 1.777 0.075980 .
## X6 0.058980 0.008534 6.911 1.11e-11 ***
## X7 0.064580 0.011058 5.840 8.07e-09 ***
## X8 0.029346 0.008334 3.521 0.000458 ***
## X9 -0.001201 0.011054 -0.109 0.913478
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4243 on 681 degrees of freedom
## Multiple R-squared: 0.8035, Adjusted R-squared: 0.8009
## F-statistic: 309.5 on 9 and 681 DF, p-value: < 2.2e-16
# 95% confidence intervals for the regression coefficients.
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) 1.263161049 1.41722010
## X1 0.067629873 0.09818383
## X2 0.007130670 0.06278057
## X3 0.028786906 0.08243314
## X4 0.031432155 0.06515114
## X5 -0.002172233 0.04362816
## X6 0.042223089 0.07573684
## X7 0.042869037 0.08629193
## X8 0.012982802 0.04570877
## X9 -0.022905253 0.02050228
cat(red ("The fitted model is: \n \n"))
## The fitted model is:
##
# Residual diagnostics: residuals vs fitted and a residual density plot,
# then the standard four lm() diagnostic plots.
par(mfrow=c(1,2))
plot(fit1$fitted,fit1$resid, lwd=2, col="blue")
plot(density(fit1$resid), lwd=2, col="blue")
par(mfrow=c(2,2))
plot(fit1, lwd=2, col = "blue")
cat(red("The residuals are not normal distributed! \n"))
## The residuals are not normal distributed!
# Shapiro-Wilk test: small p-value -> reject normality of the residuals.
shapiro.test(fit1$resid)
##
## Shapiro-Wilk normality test
##
## data: fit1$resid
## W = 0.94072, p-value = 5.697e-16
# Prediction and confidence intervals from the regression fit `fit1`.
# Fixes: (1) "classeied" typo in the printed messages; (2) predict() was
# called with `new=`, which only works through partial argument matching --
# spelled out as `newdata=`.
cat(blue("NOTE \n"))
## NOTE
cat(" in this case, the Class is classified as benign \n \n")
## in this case, the Class is classified as benign
##
# All features near the low end of the 1-10 scale -> low fitted Class (benign).
newdata = data.frame(X1=.95, X2=.95,X3=.95,X4=.95,X5=.95,X6=.95,X7=.95,X8=.95,X9=.95)
predict(fit1, newdata = newdata, interval = 'prediction')
## fit lwr upr
## 1 1.714678 0.8792744 2.550081
cat(red("Confidence and Prediction Intervals \n\n"))
## Confidence and Prediction Intervals
cat (red("first:Prediction \n\n "))
## first:Prediction
##
##
cat(blue("NOTE \n"))
## NOTE
cat(" in this case, the Class is classified as malignant \n \n")
## in this case, the Class is classified as malignant
##
# All features at the top of the scale -> high fitted Class (malignant).
newdata = data.frame(X1=10, X2=10,X3=10,X4=10,X5=10,X6=10,X7=10,X8=10,X9=10)
predict(fit1, newdata = newdata, interval = 'prediction')
## fit lwr upr
## 1 5.282159 4.429476 6.134843
cat(blue("NOTE \n"))
## NOTE
cat(" in this case, the Class is classified as benign \n \n")
## in this case, the Class is classified as benign
##
newdata = data.frame(X1=.95, X2=.95,X3=.95,X4=.95,X5=.95,X6=.95,X7=.95,X8=.95,X9=.95)
predict(fit1, newdata = newdata, interval = 'confidence')
## fit lwr upr
## 1 1.714678 1.653201 1.776154
cat(red("second :Confidence \n\n"))
## second :Confidence
cat(blue("NOTE \n"))
## NOTE
cat(" in this case, the Class is classified as malignant \n \n")
## in this case, the Class is classified as malignant
##
newdata = data.frame(X1=10, X2=10,X3=10,X4=10,X5=10,X6=10,X7=10,X8=10,X9=10)
predict(fit1, newdata = newdata, interval = 'confidence')
## fit lwr upr
## 1 5.282159 5.100638 5.463681
cat(red("*** Principal Components Analysis *** \n \n"))
## *** Principal Components Analysis ***
##
#install.packages("robustHD")
cat(red("*** PCA of correlation matrix *** \n\n"))
## *** PCA of correlation matrix ***
cat(red(" we obtain the sample correlation matrix R for these data, and determine its eigenvalues .and eigenvectors. \n\n"))
## we obtain the sample correlation matrix R for these data, and determine its eigenvalues .and eigenvectors.
#x<-standardize(data[,2:8],centerFun = mean, scaleFun = sd)
#data[,2:8]<-(-1)*data[,2:8]#resign lower scores correspond better berformane
# Feature columns X1..X9; the sample correlation matrix drives this PCA.
x<-Data[,2:10]
cor(x)
## X1 X2 X3 X4 X5 X6 X7
## X1 1.0000000 0.6433396 0.6537521 0.4879492 0.5174478 0.3575856 0.5610764
## X2 0.6433396 1.0000000 0.9054195 0.7131170 0.7471112 0.4351545 0.7595252
## X3 0.6537521 0.9054195 1.0000000 0.6909890 0.7143934 0.4359314 0.7384546
## X4 0.4879492 0.7131170 0.6909890 1.0000000 0.6084773 0.3033615 0.6698127
## X5 0.5174478 0.7471112 0.7143934 0.6084773 1.0000000 0.3658206 0.6205176
## X6 0.3575856 0.4351545 0.4359314 0.3033615 0.3658206 1.0000000 0.3796545
## X7 0.5610764 0.7595252 0.7384546 0.6698127 0.6205176 0.3796545 1.0000000
## X8 0.5357118 0.7272394 0.7246932 0.6024535 0.6340581 0.4100786 0.6690593
## X9 0.3503536 0.4600643 0.4405924 0.4171673 0.4826440 0.2348477 0.3438210
## X8 X9
## X1 0.5357118 0.3503536
## X2 0.7272394 0.4600643
## X3 0.7246932 0.4405924
## X4 0.6024535 0.4171673
## X5 0.6340581 0.4826440
## X6 0.4100786 0.2348477
## X7 0.6690593 0.3438210
## X8 1.0000000 0.4276435
## X9 0.4276435 1.0000000
# Eigen-decomposition of the correlation matrix: eigenvalues give the
# variances of the standardized principal components.
A<-cor(x)
Eigenvalues <- eigen(A)$values
cat(red("Eigenvalues are: \n\n"))
## Eigenvalues are:
Eigenvalues
## [1] 5.53204193 0.79556702 0.72927651 0.53285720 0.39421739 0.36705283 0.29304970
## [8] 0.26418253 0.09175491
#A
Eigenvectors <- eigen(A)$vectors
cat(red("Eigenvectors are: \n\n"))
## Eigenvectors are:
Eigenvectors
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.3106167 0.115560287 0.10357068 0.88942040 -0.13307153 -0.03267568
## [2,] -0.3949342 -0.008042624 0.12446943 -0.01832534 0.09940433 -0.11558871
## [3,] -0.3896742 0.020683220 0.13498279 0.03409435 0.10234161 -0.06507698
## [4,] -0.3384939 -0.187415308 0.19433955 -0.32594990 -0.66695961 -0.18730854
## [5,] -0.3492368 -0.145715316 -0.05402837 -0.14720369 0.56010138 -0.58049041
## [6,] -0.2248964 0.800332034 -0.50582628 -0.15876554 -0.13818405 -0.08514393
## [7,] -0.3551379 0.046591643 0.29079127 -0.15243440 -0.18179987 0.21398418
## [8,] -0.3518838 0.018955246 0.03680689 -0.15050751 0.35669070 0.73385743
## [9,] -0.2421370 -0.535475903 -0.75703280 0.09179097 -0.15496444 0.13303188
## [,7] [,8] [,9]
## [1,] -0.14129813 0.22206507 -1.771339e-02
## [2,] 0.09719380 -0.48803293 -7.465046e-01
## [3,] 0.06071773 -0.61234176 6.595344e-01
## [4,] -0.45955381 0.12108496 2.269974e-02
## [5,] -0.06524527 0.41599897 6.494560e-02
## [6,] -0.01779783 0.03409472 9.164522e-05
## [7,] 0.74618486 0.35501934 4.732318e-02
## [8,] -0.40550121 0.14582125 -1.847127e-02
## [9,] 0.17295974 -0.03697033 1.084722e-02
# The same PCA via princomp(); cor = TRUE standardizes the variables first.
p1 <- princomp(x, cor = TRUE) ## using correlation matrix
summary(p1)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.3520293 0.89194564 0.85397688 0.72997068 0.62786733
## Proportion of Variance 0.6146713 0.08839634 0.08103072 0.05920636 0.04380193
## Cumulative Proportion 0.6146713 0.70306766 0.78409838 0.84330474 0.88710667
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.60584885 0.54134065 0.51398690 0.30291073
## Proportion of Variance 0.04078365 0.03256108 0.02935361 0.01019499
## Cumulative Proportion 0.92789032 0.96045140 0.98980501 1.00000000
# PCA on correlation matrix
pcaCOR <- princomp(x=Data[,2:10], cor=TRUE)
# resign PCA solution
# Resolve the sign ambiguity of each PC: flip any component whose sum of
# cubed loadings is negative, so dominant loadings come out positive.
# Note: the matrix product below returns a plain matrix (the "loadings"
# class and its print method are dropped).
pcsign <- sign(colSums(pcaCOR$loadings^3))
pcaCOR$loadings <- pcaCOR$loadings %*% diag(pcsign)
pcaCOR$loadings
## [,1] [,2] [,3] [,4] [,5] [,6]
## X1 0.3106167 0.115560287 -0.10357068 0.88942040 0.13307153 -0.03267568
## X2 0.3949342 -0.008042624 -0.12446943 -0.01832534 -0.09940433 -0.11558871
## X3 0.3896742 0.020683220 -0.13498279 0.03409435 -0.10234161 -0.06507698
## X4 0.3384939 -0.187415308 -0.19433955 -0.32594990 0.66695961 -0.18730854
## X5 0.3492368 -0.145715316 0.05402837 -0.14720369 -0.56010138 -0.58049041
## X6 0.2248964 0.800332034 0.50582628 -0.15876554 0.13818405 -0.08514393
## X7 0.3551379 0.046591643 -0.29079127 -0.15243440 0.18179987 0.21398418
## X8 0.3518838 0.018955246 -0.03680689 -0.15050751 -0.35669070 0.73385743
## X9 0.2421370 -0.535475903 0.75703280 0.09179097 0.15496444 0.13303188
## [,7] [,8] [,9]
## X1 -0.14129813 -0.22206507 1.771339e-02
## X2 0.09719380 0.48803293 7.465046e-01
## X3 0.06071773 0.61234176 -6.595344e-01
## X4 -0.45955381 -0.12108496 -2.269974e-02
## X5 -0.06524527 -0.41599897 -6.494560e-02
## X6 -0.01779783 -0.03409472 -9.164522e-05
## X7 0.74618486 -0.35501934 -4.732318e-02
## X8 -0.40550121 -0.14582125 1.847127e-02
## X9 0.17295974 0.03697033 -1.084722e-02
## *** The total sample variance explained by the first six principal components is approximately \(92.8\%\) of the total sample variance. According to the outputs above, the first six principal components for the standardized variables are given below.
## \[y_1^{*}= e_1^{*}X=0.3106167 X1 + 0.3949342 X2 + 0.3896742X3 + 0.3384939 X4 + 0.3492368 X5 + 0.2248964 X6 + 0.3551379 X7 + 0.3518838 X8 + 0.2421370 X9\] \(y_2^{*}= e_2^{*}X=0.115560287 X1 -0.008042624 X2 + 0.020683220X3 -0.187415308 X4 -0.145715316 X5 + 0.800332034 X6 +0.046591643 X7 + 0.018955246 X8 -0.535475903 X9\) \(y_3^{*}= e_3^{*}X=-0.10357068 X1 -0.12446943 X2 -0.13498279 X3 -0.19433955 X4 + 0.05402837 X5 + 0.50582628 X6 -0.29079127 X7 -0.03680689 X8 + 0.75703280 X9\) \(y_4^{*}= e_4^{*}X=0.88942040X1 -0.01832534 X2 + 0.03409435 X3 -0.32594990 X4 -0.14720369 X5 -0.15876554 X6 -0.15243440 X7 -0.15050751 X8 + 0.09179097 X9\) \(y_5^{*}= e_5^{*}X=0.13307153 X1 -0.09940433 X2 -0.10234161 X3+ 0.66695961 X4 -0.56010138X5 + 0.13818405 X6 + 0.18179987 X7 -0.35669070 X8 + 0.15496444 X9\) \(y_6^{*}= e_6^{*}X=-0.03267568X1 -0.11558871X2 -0.06507698 X3 -0.18730854 X4 -0.58049041 X5 -0.08514393 X6 + 0.21398418 X7 +0.73385743 X8 +0.13303188 X9\)
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
# Flatten the upper triangle of a correlation matrix into long format:
# one row per variable pair.
#
# cormat : square matrix of correlation coefficients (with row names)
# pmat   : matrix of the corresponding p-values, same dimensions
#
# Returns a data.frame with columns: row, column, cor, p.
flattenCorrMatrix <- function(cormat, pmat) {
  keep <- upper.tri(cormat)
  labels <- rownames(cormat)
  data.frame(
    row = labels[row(cormat)[keep]],
    column = labels[col(cormat)[keep]],
    cor = cormat[keep],
    p = pmat[keep]
  )
}
#install.packages("Hmisc")
library("Hmisc")
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:robustbase':
##
## heart
## The following object is masked from 'package:boot':
##
## aml
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:DescTools':
##
## Label, Mean, %nin%, Quantile
## The following objects are masked from 'package:TeachingDemos':
##
## cnvrt.coords, subplot
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
# Pairwise Pearson correlations with p-values (Hmisc::rcorr), flattened into
# a long table of unique variable pairs.
S<-rcorr(as.matrix(x), type = c("pearson"))
flattenCorrMatrix(S$r, S$P)
# Append the first PC score (correlation-matrix PCA, p1 from above) as a new
# column, then list cases ordered by that score (columns: id, Class, scores).
Data$scores <- p1$scores[,1]
df_sorted_names_asc <- Data[with(Data, order(scores)), ]
y<-df_sorted_names_asc[c(1,11,12)]
z<-na.omit(y)
z
#install.packages("ggfortify")
library(ggfortify)
# PCA biplot of the nine features, coloured by tumour class.
# Fixes: the original referenced an undefined object `data1` and a
# nonexistent column 'Breed' (left over from another analysis), and used
# only columns 2:8 instead of the full feature set 2:10.
autoplot(prcomp(Data[,2:10], scale = TRUE), data = Data, colour = 'Class',
         shape = TRUE, label.size = 3, loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 3, outlier.color = "red")
cat(red("*** PCA of covariance matrix *** \n\n"))
## *** PCA of covariance matrix ***
cat(red(" we obtain the sample covariance matrix R for these data, and determine its eigenvalues .and eigenvectors. \n\n"))
## we obtain the sample covariance matrix R for these data, and determine its eigenvalues .and eigenvectors.
#install.packages("robustHD")
library(robustHD)
#x<-standardize(data[,2:8],centerFun = mean, scaleFun = sd)
# Same nine features, this time analysed on the raw (unstandardized) scale.
x<-Data[,2:10]
cov(x)
## X1 X2 X3 X4 X5 X6 X7 X8
## X1 7.929071 5.509532 5.448780 3.938633 3.205319 2.1613499 3.858693 4.625479
## X2 5.509532 9.249678 8.150574 6.217043 4.998528 2.8407999 5.641733 6.781956
## X3 5.448780 8.150574 8.760926 5.862812 4.651639 2.7696638 5.338336 6.577237
## X4 3.938633 6.217043 5.862812 8.217119 3.837052 1.8666121 4.689431 5.295386
## X5 3.205319 4.998528 4.651639 3.837052 4.839351 1.7274104 3.333916 4.276977
## X6 2.161350 2.840800 2.769664 1.866612 1.727410 4.6075253 1.990350 2.699077
## X7 3.858693 5.641733 5.338336 4.689431 3.333916 1.9903500 5.965050 5.010556
## X8 4.625479 6.781956 6.577237 5.295386 4.276977 2.6990772 5.010556 9.402181
## X9 1.699948 2.411013 2.247138 2.060572 1.829525 0.8686361 1.446962 2.259508
## X9
## X1 1.6999476
## X2 2.4110132
## X3 2.2471382
## X4 2.0605717
## X5 1.8295245
## X6 0.8686361
## X7 1.4469620
## X8 2.2595084
## X9 2.9691730
# Eigen-decomposition of the covariance matrix.
A<-cov(x)
Eigenvalues <- eigen(A)$values
cat(red("Eigenvalues are: \n \n"))
## Eigenvalues are:
##
Eigenvalues
## [1] 41.1747397 4.3889468 3.8191682 3.1531855 2.6494007 2.5528957 1.7877936
## [8] 1.5905766 0.8233674
#A
Eigenvectors <- eigen(A)$vectors
cat(red("Eigenvectors are: \n \n"))
## Eigenvectors are:
##
Eigenvectors
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.3236794 0.76783955 -0.43169732 -0.141916368 0.2757022 -0.12294318
## [2,] -0.4449478 -0.03312929 -0.05464897 0.085402875 -0.4123807 0.17170064
## [3,] -0.4283637 0.02701938 -0.04558182 0.053592794 -0.3928750 0.21594767
## [4,] -0.3587123 -0.51042556 -0.35812814 0.351172865 0.5178814 -0.18383038
## [5,] -0.2739385 -0.09496559 0.02593170 0.009435063 -0.3805591 -0.29470087
## [6,] -0.1676658 0.33012537 0.63740285 0.647449577 0.1760703 -0.07816444
## [7,] -0.3178184 -0.10801569 -0.04160626 0.049253407 0.1399403 0.45879674
## [8,] -0.4017347 -0.12190582 0.52098868 -0.649898737 0.3262109 -0.02376615
## [9,] -0.1397485 -0.05942164 0.01422342 -0.047929985 -0.1719181 -0.75561849
## [,7] [,8] [,9]
## [1,] 0.01252450 0.08581056 0.022453558
## [2,] -0.19644218 -0.10839923 0.735367173
## [3,] -0.35916736 -0.20046082 -0.663818276
## [4,] -0.21575182 0.10193334 -0.016707275
## [5,] 0.34887057 0.74121247 -0.111343355
## [6,] -0.01056879 0.01436760 0.001881781
## [7,] 0.76661528 -0.24698526 -0.066246728
## [8,] -0.13967133 0.05343578 0.024673206
## [9,] 0.23806328 -0.56289586 -0.019971028
# Same PCA via princomp(); with cor = FALSE (default) it uses the covariance
# matrix. Note: this overwrites the correlation-based p1 from earlier.
p1 <- princomp(x) ## using covariance matrix
summary(p1)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 6.4121098 2.09346487 1.95285462 1.77443577 1.62651976
## Proportion of Variance 0.6647512 0.07085795 0.06165908 0.05090703 0.04277361
## Cumulative Proportion 0.6647512 0.73560917 0.79726825 0.84817529 0.89094890
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 1.59662182 1.33611615 1.26026774 0.90673913
## Proportion of Variance 0.04121557 0.02886328 0.02567928 0.01329297
## Cumulative Proportion 0.93216447 0.96102775 0.98670703 1.00000000
cat(red("Loadings (which should not be confused with eigenvectors) have the following properties: \n"))
## Loadings (which should not be confused with eigenvectors) have the following properties:
cat("Their sums of squares within each component are the eigenvalues (components' variances).\n")
## Their sums of squares within each component are the eigenvalues (components' variances).
cat("Loadings are coefficients in linear combination predicting a variable by the (standardized) components.\n")
## Loadings are coefficients in linear combination predicting a variable by the (standardized) components.
# PCA on covariance matrix (default)
pcaCOV <- princomp(x=Data[,2:10])
names(pcaCOV)
## [1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call"
#resign PCA solution
# Same sign-fixing trick as for pcaCOR: flip each component so the sum of
# its cubed loadings is positive (removes the arbitrary sign of each PC).
pcsign <- sign(colSums(pcaCOV$loadings^3))
pcaCOV$loadings <- pcaCOV$loadings %*% diag(pcsign)
pcaCOV$loadings
## [,1] [,2] [,3] [,4] [,5] [,6]
## X1 0.3236794 0.76783955 -0.43169732 -0.141916368 0.2757022 0.12294318
## X2 0.4449478 -0.03312929 -0.05464897 0.085402875 -0.4123807 -0.17170064
## X3 0.4283637 0.02701938 -0.04558182 0.053592794 -0.3928750 -0.21594767
## X4 0.3587123 -0.51042556 -0.35812814 0.351172865 0.5178814 0.18383038
## X5 0.2739385 -0.09496559 0.02593170 0.009435063 -0.3805591 0.29470087
## X6 0.1676658 0.33012537 0.63740285 0.647449577 0.1760703 0.07816444
## X7 0.3178184 -0.10801569 -0.04160626 0.049253407 0.1399403 -0.45879674
## X8 0.4017347 -0.12190582 0.52098868 -0.649898737 0.3262109 0.02376615
## X9 0.1397485 -0.05942164 0.01422342 -0.047929985 -0.1719181 0.75561849
## [,7] [,8] [,9]
## X1 0.01252450 0.08581056 0.022453558
## X2 -0.19644218 -0.10839923 0.735367173
## X3 -0.35916736 -0.20046082 -0.663818276
## X4 -0.21575182 0.10193334 -0.016707275
## X5 0.34887057 0.74121247 -0.111343355
## X6 -0.01056879 0.01436760 0.001881781
## X7 0.76661528 -0.24698526 -0.066246728
## X8 -0.13967133 0.05343578 0.024673206
## X9 0.23806328 -0.56289586 -0.019971028
## *** The total sample variance explained by the first six principal components is approximately \(93.2\%\) of the total sample variance. According to the outputs above, the first six principal components are given below.
## \(y_1^{*}= e_1^{*}X=0.3236794 X1 + 0.4449478 X2 + 0.4283637 X3 + 0.3587123 X4 + 0.27393855 X5 + 0.1676658 X6 + 0.3178184 X7 + 0.4017347 X8 + 0.1397485 X9\) \(y_2^{*}= e_2^{*}X=0.76783955 X1 -0.03312929 X2 +0.02701938 X3 -0.51042556 X4 -0.09496559 X5 + 0.33012537 X6 -0.10801569 X7 - 0.10839923 X8 -0.05942164 X9\) \(y_3^{*}= e_3^{*}X=-0.43169732 X1 -0.05464897 X2 -0.04558182 X3 -0.35812814 X4 + 0.02593170 X5 + 0.63740285 X6 -0.04160626 X7 + 0.52098868 X8 + 0.01422342X9\) \(y_4^{*}= e_4^{*}X=-0.141916368 X1 + 0.085402875 X2 + 0.053592794 X3 +0.351172865 X4 + 0.009435063 X5 + 0.647449577 X6 + 0.049253407 X7-0.649898737 X8 -0.047929985 X9\) \(y_5^{*}= e_5^{*}X=0.2757022 X1 -0.4123807 X2 -0.3928750 X3 + 0.5178814 X4 -0.3805591 X5 + 0.1760703 X6 + 0.1399403 X7 + 0.3262109 X8 -0.1719181 X9\) \(y_6^{*}= e_6^{*}X=0.12294318 X1 -0.17170064 X2 -0.21594767 X3 +0.18383038 X4 + 0.29470087 X5 +0.07816444 X6 -0.45879674 X7 + 0.02376615 X8 +0.75561849 X9\)
## According to the above results, the total sample variance explained by the first six sample principal components is approximately 93.2% of the total sample variance.
## ** The total variance explained by the first six PCs of the covariance matrix is greater than the total variance explained by the first six PCs of the correlation matrix.
# Append the first PC score (covariance-matrix PCA, current p1) and list
# cases ordered by it (columns: id, Class, scores).
Data$scores <- p1$scores[,1]
df_sorted_names_asc <- Data[with(Data, order(scores)), ]
y<-df_sorted_names_asc[c(1,11,12)]
z<-na.omit(y)
z
# Side-by-side loading plots for PC1 vs PC2 of both PCA solutions.
# Fixes: axis-label typo "Loaings" -> "Loadings"; the scree plots below were
# drawn against x = 2:10, mislabelling PC1..PC9 as components 2..10 -- use
# an index that matches the nine components.
dev.new(width=10, height=5, noRStudioGD=TRUE)
par(mfrow=c(1,2))
plot(pcaCOV$loadings[,1:2], xlab="PC1 Loadings", ylab="PC2 Loadings",type="n", main="PCA of Covariance Matrix", xlim=c(-0.15, 1.15), ylim=c(-0.15, 1.15))
text(pcaCOV$loadings[,1:2],labels=colnames(Data[,2:10]))
plot(pcaCOR$loadings[,1:2],xlab="PC1 Loadings", ylab="PC2 Loadings", type="n", main="PCA of Correlation Matrix", xlim=c(0.05, 0.45), ylim=c(-0.5,0.6))
text(pcaCOR$loadings[,1:2],labels=colnames(Data[,2:10]))
# Scree plots: variance of each of the nine PCs for both solutions.
dev.new(width=10, height=5, noRStudioGD=TRUE)
par(mfrow=c(1,2))
plot(seq_along(pcaCOV$sdev), pcaCOV$sdev^2, type="b", xlab="# PCs", ylab="Variance of PC",main="PCA of Covariance Matrix")
plot(seq_along(pcaCOR$sdev), pcaCOR$sdev^2, type="b", xlab="# PCs", ylab="Variance of PC",main="PCA of Correlation Matrix")
cat (red("*** Factor Analysis ***\n"))
## *** Factor Analysis ***
cat(" First : **factor Analysis using PCA solution\n ")
## First : **factor Analysis using PCA solution
##
#install.packages("psych")
#install.packages("GPArotation")
cat(blue("*number of factor is two rotate and non rotate"))
## *number of factor is two rotate and non rotate
# Two-factor solution via psych::fa, principal-axis (pa) extraction from the
# correlation matrix, tenBerge factor scores, no rotation.
# (The "maximum iteration exceeded" lines below are recorded warnings.)
fit <- fa((Data[,2:10]), nfactors=2,rotate = "none", fm='pa',scores="tenBerge",cor="cor")
## maximum iteration exceeded
print(fit$loading) # print results
##
## Loadings:
## PA1 PA2
## X1 0.681
## X2 0.939
## X3 0.922 -0.106
## X4 0.759
## X5 0.792 0.110
## X6 0.469
## X7 0.813 -0.166
## X8 0.797
## X9 0.540 0.521
##
## PA1 PA2
## SS loadings 5.206 0.334
## Proportion Var 0.578 0.037
## Cumulative Var 0.578 0.615
# Same extraction with varimax (orthogonal) rotation for interpretability.
factor <- fa(Data[,2:10],nfactors = 2,rotate = "varimax", fm='pa',scores="tenBerge",cor="cor")
## maximum iteration exceeded
factor$loading
##
## Loadings:
## PA1 PA2
## X1 0.630 0.263
## X2 0.875 0.352
## X3 0.870 0.324
## X4 0.676 0.345
## X5 0.656 0.458
## X6 0.434 0.180
## X7 0.800 0.221
## X8 0.714 0.355
## X9 0.244 0.709
##
## PA1 PA2
## SS loadings 4.203 1.336
## Proportion Var 0.467 0.148
## Cumulative Var 0.467 0.615
##install.packages("psych")
##install.packages("GPArotation")
par(mfrow=c(1,2))
library(psych)
library(GPArotation)
# Parallel analysis to judge how many factors/components to retain.
fit <- fa.parallel(Data[,2:10], nfactors=2, fm='pa')
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
print(fit) # print results
## Call: fa.parallel(x = Data[, 2:10], fm = "pa", nfactors = 2)
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
##
## Eigen Values of
## Original factors Resampled data Simulated data Original components
## 1 5.21 0.4 0.45 5.53
## Resampled components Simulated components
## 1 1.16 1.17
# Path diagram of the rotated two-factor solution.
fa.diagram(factor)
## ****************************************************
##install.packages("psych")
##install.packages("GPArotation")
cat(blue("*number of factor is three rotate and non rotate \n\n"))
## *number of factor is three rotate and non rotate
library(psych)
library(GPArotation)
# Three-factor principal-axis solution, unrotated.
fit <- fa(Data[,2:10], nfactors=3,rotate = "none", fm='pa',scores="tenBerge",cor="cor")
## maximum iteration exceeded
print(fit$loading) # print results
##
## Loadings:
## PA1 PA2 PA3
## X1 0.682 0.133
## X2 0.937
## X3 0.921 -0.105
## X4 0.780 -0.345
## X5 0.790 0.119
## X6 0.471 0.181
## X7 0.814 -0.171
## X8 0.795
## X9 0.536 0.499
##
## PA1 PA2 PA3
## SS loadings 5.226 0.315 0.190
## Proportion Var 0.581 0.035 0.021
## Cumulative Var 0.581 0.616 0.637
# Same extraction with varimax rotation.
threefactor <- fa(Data[,2:10],nfactors =3,rotate = "varimax", fm='pa',scores="tenBerge",cor="cor")
## maximum iteration exceeded
threefactor$loading
##
## Loadings:
## PA1 PA3 PA2
## X1 0.583 0.303 0.233
## X2 0.715 0.528 0.309
## X3 0.738 0.492 0.281
## X4 0.335 0.722 0.305
## X5 0.523 0.426 0.428
## X6 0.458 0.139 0.163
## X7 0.576 0.584 0.167
## X8 0.590 0.428 0.320
## X9 0.219 0.187 0.674
##
## PA1 PA3 PA2
## SS loadings 2.720 1.894 1.117
## Proportion Var 0.302 0.210 0.124
## Cumulative Var 0.302 0.513 0.637
##install.packages("psych")
##install.packages("GPArotation")
par(mfrow=c(1,2))
library(psych)
library(GPArotation)
# Parallel analysis (both factors and components) for the 3-factor request.
fit <- fa.parallel(Data[,2:10], nfactors=3, fm='pa', fa='both')
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
print(fit) # print results
## Call: fa.parallel(x = Data[, 2:10], fm = "pa", fa = "both", nfactors = 3)
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
##
## Eigen Values of
## Original factors Resampled data Simulated data Original components
## 1 5.23 0.51 0.44 5.53
## Resampled components Simulated components
## 1 1.18 1.17
# Path diagram of the rotated three-factor solution.
fa.diagram(threefactor)
##install.packages("psych")
##install.packages("GPArotation")
cat(blue("*number of factor is four rotate and non rotate \n\n"))
## *number of factor is four rotate and non rotate
library(psych)
library(GPArotation)
# Four-factor principal-axis solution, unrotated.
fit <- fa(Data[,2:10], nfactors=4,rotate = "none", fm='pa',scores="tenBerge",cor="cor")
print(fit$loading) # print results
##
## Loadings:
## PA1 PA2 PA3 PA4
## X1 0.680 0.119
## X2 0.940 -0.148
## X3 0.929 0.120 -0.186
## X4 0.765 -0.221
## X5 0.790 0.157
## X6 0.477 0.256 0.191
## X7 0.830 -0.263 -0.186 0.147
## X8 0.796 0.113
## X9 0.526 0.418
##
## PA1 PA2 PA3 PA4
## SS loadings 5.244 0.278 0.184 0.131
## Proportion Var 0.583 0.031 0.020 0.015
## Cumulative Var 0.583 0.614 0.634 0.649
# Same extraction with varimax rotation.
fourfactor <- fa(Data[,2:10],nfactors =4,rotate = "varimax", fm='pa',scores="tenBerge",cor="cor")
fourfactor$loading
##
## Loadings:
## PA4 PA1 PA2 PA3
## X1 0.338 0.368 0.260 0.400
## X2 0.505 0.607 0.371 0.384
## X3 0.459 0.657 0.321 0.414
## X4 0.574 0.344 0.395 0.174
## X5 0.406 0.369 0.502 0.313
## X6 0.170 0.156 0.162 0.501
## X7 0.755 0.279 0.197 0.358
## X8 0.468 0.309 0.379 0.434
## X9 0.157 0.148 0.616 0.177
##
## PA4 PA1 PA2 PA3
## SS loadings 1.918 1.410 1.304 1.205
## Proportion Var 0.213 0.157 0.145 0.134
## Cumulative Var 0.213 0.370 0.515 0.649
##install.packages("psych")
##install.packages("GPArotation")
par(mfrow=c(1,2))
library(psych)
library(GPArotation)
# Parallel analysis for the 4-factor request (still suggests one factor).
fit <- fa.parallel(Data[,2:10], nfactors=4, fm='pa', fa='both')
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
print(fit) # print results
## Call: fa.parallel(x = Data[, 2:10], fm = "pa", fa = "both", nfactors = 4)
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
##
## Eigen Values of
## Original factors Resampled data Simulated data Original components
## 1 5.24 0.6 0.62 5.53
## Resampled components Simulated components
## 1 1.17 1.16
# Path diagram of the rotated four-factor solution.
fa.diagram(fourfactor)
#install.packages("nFactors")
# Fixed typo in the displayed message: "liklihood" -> "likelihood".
cat(red( "Second: factor Analysis using maximum likelihood solution \n" ))
## Second: factor Analysis using maximum likelihood solution
cat(blue("number of factor is two just rotate \n\n"))
## number of factor is two just rotate
# Maximum-likelihood factor analysis (stats::factanal), 2 factors,
# varimax rotation, regression factor scores.
fit.22 <- factanal(Data[,2:10],factors=2,rotation="varimax" ,scores = "regression")
print(fit.22, digits = 2, cutoff = .3, sort = TRUE)
##
## Call:
## factanal(x = Data[, 2:10], factors = 2, scores = "regression", rotation = "varimax")
##
## Uniquenesses:
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 0.53 0.10 0.07 0.39 0.36 0.78 0.34 0.37 0.71
##
## Loadings:
## Factor1 Factor2
## X1 0.56 0.39
## X2 0.76 0.56
## X3 0.84 0.46
## X4 0.48 0.61
## X5 0.51 0.62
## X7 0.56 0.59
## X8 0.53 0.59
## X6 0.36
## X9 0.47
##
## Factor1 Factor2
## SS loadings 2.88 2.46
## Proportion Var 0.32 0.27
## Cumulative Var 0.32 0.59
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 68.77 on 19 degrees of freedom.
## The p-value is 1.47e-07
##install.packages("psych")
##install.packages("GPArotation")
library(psych)
library(GPArotation)
# Parallel analysis under the maximum-likelihood method (fm = 'ml'),
# comparing both factor and component eigenvalues ('both').
fit.2 <- fa.parallel(Data[,2:10], nfactors=2, fm='ml', fa='both')
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
print(fit.2) # print results
## Call: fa.parallel(x = Data[, 2:10], fm = "ml", fa = "both", nfactors = 2)
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
##
## Eigen Values of
## Original factors Resampled data Simulated data Original components
## 1 5.2 0.64 0.75 5.53
## Resampled components Simulated components
## 1 1.18 1.18
cat(blue("number of factor is three just rotate \n\n"))
## number of factor is three just rotate
# Maximum-likelihood factor analysis, 3 factors, varimax rotation.
# NOTE(review): `fit.3` is reassigned to a fa.parallel() result further down
# the script, which clobbers this model object — verify that is intended.
fit.3<- factanal(Data[,2:10],factors=3,rotation="varimax")
print(fit.3, digits=2, cutoff=.3, sort=TRUE)
##
## Call:
## factanal(x = Data[, 2:10], factors = 3, rotation = "varimax")
##
## Uniquenesses:
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 0.53 0.10 0.07 0.41 0.32 0.79 0.00 0.38 0.62
##
## Loadings:
## Factor1 Factor2 Factor3
## X1 0.52 0.31 0.31
## X2 0.72 0.40 0.46
## X3 0.81 0.36 0.38
## X7 0.40 0.88
## X5 0.46 0.32 0.61
## X9 0.56
## X4 0.45 0.43 0.44
## X6 0.33
## X8 0.50 0.40 0.46
##
## Factor1 Factor2 Factor3
## SS loadings 2.43 1.67 1.67
## Proportion Var 0.27 0.19 0.19
## Cumulative Var 0.27 0.46 0.64
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 20.43 on 12 degrees of freedom.
## The p-value is 0.0594
##install.packages("psych")
##install.packages("GPArotation")
library(psych)
library(GPArotation)
# Parallel analysis (ML, 3 factors). Renamed from `fit.3`: the original
# assignment overwrote the factanal() model stored in `fit.3` above.
fit.par3 <- fa.parallel(Data[,2:10], nfactors=3, fm='ml', fa='both')
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
print(fit.par3) # print results
## Call: fa.parallel(x = Data[, 2:10], fm = "ml", fa = "both", nfactors = 3)
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
##
## Eigen Values of
## Original factors Resampled data Simulated data Original components
## 1 5.25 0.9 0.96 5.53
## Resampled components Simulated components
## 1 1.17 1.18
cat(blue("number of factor is four just rotate\n\n"))
## number of factor is four just rotate
# Maximum-likelihood factor analysis, 4 factors, varimax rotation.
# Renamed from the generic `fit` (reused elsewhere in the script) to
# `fit.44`, matching the `fit.22` naming used for the 2-factor model.
fit.44 <- factanal(Data[,2:10],factors=4,rotation="varimax")
print(fit.44, digits=2, cutoff=.3, sort=TRUE)
##
## Call:
## factanal(x = Data[, 2:10], factors = 4, rotation = "varimax")
##
## Uniquenesses:
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 0.52 0.09 0.09 0.39 0.33 0.66 0.00 0.36 0.61
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## X2 0.67 0.39 0.43 0.35
## X3 0.71 0.35 0.36 0.39
## X7 0.33 0.86
## X5 0.41 0.31 0.58
## X9 0.56
## X6 0.50
## X1 0.43 0.38
## X4 0.44 0.43 0.45
## X8 0.39 0.37 0.43 0.40
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 1.83 1.56 1.51 1.05
## Proportion Var 0.20 0.17 0.17 0.12
## Cumulative Var 0.20 0.38 0.54 0.66
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 6.35 on 6 degrees of freedom.
## The p-value is 0.385
##install.packages("psych")
##install.packages("GPArotation")
library(psych)
library(GPArotation)
# Parallel analysis under the ML method with 4 factors.
fit.4 <- fa.parallel(Data[,2:10], nfactors=4, fm='ml', fa='both')
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
print(fit.4) # print results
## Call: fa.parallel(x = Data[, 2:10], fm = "ml", fa = "both", nfactors = 4)
## Parallel analysis suggests that the number of factors = 1 and the number of components = 1
##
## Eigen Values of
## Original factors Resampled data Simulated data Original components
## 1 5.26 0.91 1.02 5.53
## Resampled components Simulated components
## 1 1.18 1.19
## *** The results by the two methods (PCA and ML) show that the latter is more
## difficult to interpret. The factor scores for the first two factors are
## better in the maximum-likelihood solution than in the PCA solution.
## (This line was bare prose at the top level — an R syntax error — and is now
## a comment.)
cat(red("*** Canonical Correlation Analysis *** \n\n"))
## *** Canonical Correlation Analysis ***
# (Fixed duplicated "to to" in the displayed explanation.)
cat("Canonical correlation analysis (CCA) is a multidimensional exploratory statistical method which operates on the same principle as the principal component analysis. The main purpose of the canonical correlation approach is the exploration of sample correlations between two sets of quantitative variables observed on the same experimental units. On the other hand PCA method deals with one data set only and it tries to reduce the overall dimensionality of the dataset using some linear combination of the initial variables.")
## Canonical correlation analysis (CCA) is a multidimensional exploratory statistical method which operates on the same principle as the principal component analysis. The main purpose of the canonical correlation approach is the exploration of sample correlations between two sets of quantitative variables observed on the same experimental units. On the other hand PCA method deals with one data set only and it tries to reduce the overall dimensionality of the dataset using some linear combination of the initial variables.
# Split the variables into two sets (column 5 of each carries Class, unused below).
X <- Data[,c(2,3,4,5,11)]
Y <- Data[,c(6,7,8,9,11)]
# cor()'s `method` argument takes a SINGLE string; passing the whole vector
# c("pearson", "kendall", "spearman") silently selects only the first entry
# via match.arg, so state "pearson" explicitly.
R12 <- cor(X[,1:4], Y[,1:4], method = "pearson")
R21 <- t(R12)
R11 <- cor(X[,1:4], method = "pearson")
R22 <- cor(Y[,1:4], method = "pearson")
# Finding the E1 and E2 matrices:
E1 <- solve(R11) %*% R12 %*% solve(R22) %*% R21
E2 <- solve(R22) %*% R21 %*% solve(R11) %*% R12
# The canonical correlations are the square roots of the (shared) eigenvalues:
Xeig <- sqrt(eigen(E1)$values)
Yeig <- sqrt(eigen(E2)$values)
cat(" ### Canonical Correlation \n")
## ### Canonical Correlation
Xeig
## [1] 0.88127838 0.14645483 0.08901136 0.05161326
Yeig
## [1] 0.88127838 0.14645483 0.08901136 0.05161326
# NOTE(review): every coefficient printed by the cat() calls below is
# transcribed BY HAND from the eigen() output above. If the data or the
# cleaning steps change, these hard-coded numbers go stale — verify against
# eigen(E1)$vectors / eigen(E2)$vectors before trusting them.
cat("# The canonical variates are based on the eigenvectors of E1 :\n\n")
## # The canonical variates are based on the eigenvectors of E1 :
eigen(E1)$vectors
## [,1] [,2] [,3] [,4]
## [1,] 0.1491751 0.2073474 -0.2669547 -0.454931942
## [2,] 0.8091378 -0.0670727 0.8195504 -0.432036992
## [3,] 0.4472343 0.5864703 -0.4616679 0.778690262
## [4,] 0.3507483 -0.7801031 -0.2096071 0.004737334
cat("\n# a1 = (0.1491751 ,0.8091378,0.4472343,0.3507483)","\n")
##
## # a1 = (0.1491751 ,0.8091378,0.4472343,0.3507483)
cat("# b1 = (0.2073474, -0.0670727,0.5864703,-0.7801031) \n")
## # b1 = (0.2073474, -0.0670727,0.5864703,-0.7801031)
cat("# c1 = (-0.2669547, 0.8195504,-0.4616679 ,-0.2096071) \n")
## # c1 = (-0.2669547, 0.8195504,-0.4616679 ,-0.2096071)
cat("# d1 = (-0.454931942, -0.432036992,0.778690262,0.004737334) \n \n")
## # d1 = (-0.454931942, -0.432036992,0.778690262,0.004737334)
##
cat("# The canonical variates are based on the eigenvectors of E2 :\n\n")
## # The canonical variates are based on the eigenvectors of E2 :
eigen(E2)$vectors
## [,1] [,2] [,3] [,4]
## [1,] -0.5790806 0.03587854 0.83125068 0.006973835
## [2,] -0.1199505 -0.66790617 -0.06555658 -0.398485477
## [3,] -0.6738946 0.66177701 -0.39863628 -0.510026600
## [4,] -0.4428810 -0.33862260 -0.38185568 0.762255572
cat("\n# a2 = (-0.5790806 , -0.1199505,-0.6738946,-0.4428810)","\n")
##
## # a2 = (-0.5790806 , -0.1199505,-0.6738946,-0.4428810)
cat("# b2 = (0.03587854 , -0.66790617,0.66177701,-0.33862260) \n")
## # b2 = (0.03587854 , -0.66790617,0.66177701,-0.33862260)
cat("# c2 = (0.83125068, -0.06555658, -0.39863628,-0.38185568) \n")
## # c2 = (0.83125068, -0.06555658, -0.39863628,-0.38185568)
cat("# d2 = (0.006973835, -0.398485477,-0.510026600,0.762255572) \n \n")
## # d2 = (0.006973835, -0.398485477,-0.510026600,0.762255572)
##
# The canonical variates written out as linear combinations of the variables.
cat("\n# U1 = (0.1491751 X1+0.8091378 X2 +0.4472343 X3+ 0.3507483 X4)","\n")
##
## # U1 = (0.1491751 X1+0.8091378 X2 +0.4472343 X3+ 0.3507483 X4)
cat("# U2 = (0.2073474 X1 -0.0670727 X2 + 0.5864703 X3- 0.7801031 X4) \n")
## # U2 = (0.2073474 X1 -0.0670727 X2 + 0.5864703 X3- 0.7801031 X4)
cat("# U3 = (-0.2669547 X1+ 0.8195504 X2 -0.4616679 X3 - 0.2096071 X4) \n")
## # U3 = (-0.2669547 X1+ 0.8195504 X2 -0.4616679 X3 - 0.2096071 X4)
cat("# U4 = (-0.454931942 X1 -0.432036992 X2 +0.778690262 X3+0.004737334 X4) \n \n")
## # U4 = (-0.454931942 X1 -0.432036992 X2 +0.778690262 X3+0.004737334 X4)
##
cat("\n# V1 = (-0.5790806 X5 -0.1199505 X6 -0.6738946 X7-0.4428810 X8)","\n")
##
## # V1 = (-0.5790806 X5 -0.1199505 X6 -0.6738946 X7-0.4428810 X8)
cat("# V2 = (0.03587854 X5 - 0.66790617 X6+ 0.66177701 X7 - 0.33862260 X8) \n")
## # V2 = (0.03587854 X5 - 0.66790617 X6+ 0.66177701 X7 - 0.33862260 X8)
cat("# V3 = (0.83125068 X5 - 0.06555658 X6- 0.39863628 X7 - 0.38185568 X8) \n")
## # V3 = (0.83125068 X5 - 0.06555658 X6- 0.39863628 X7 - 0.38185568 X8)
cat("# V4 = (0.006973835 X5+ -0.398485477 X6- 0.510026600 X7 +0.762255572 X8 ) \n \n")
## # V4 = (0.006973835 X5+ -0.398485477 X6- 0.510026600 X7 +0.762255572 X8 )
##
# --- Canonical variate scores ------------------------------------------------
# BUG FIX: the original code computed solve(sqrt(R11)) %*% eigenvector.
# sqrt() on a matrix is ELEMENTWISE, not the matrix square root, and the
# result was a 4x1 COEFFICIENT vector, not scores — so cor(u1, v1) correlated
# four coefficients (giving 0.78, 0.97, ...) instead of reproducing the
# canonical correlations (0.881, 0.146, ...).
# The eigenvectors of E1/E2 already serve as (unscaled) canonical
# coefficients, so score each observation on the standardized data and
# correlate the score vectors; |cor(u_k, v_k)| then matches Xeig[k] up to the
# arbitrary sign of the eigenvectors.
A <- eigen(E1)$vectors  # columns: canonical coefficients for the X block
B <- eigen(E2)$vectors  # columns: canonical coefficients for the Y block
Xs <- scale(X[, 1:4])   # standardize: the analysis is correlation-based
Ys <- scale(Y[, 1:4])
u1 <- Xs %*% A[, 1]
v1 <- Ys %*% B[, 1]
u2 <- Xs %*% A[, 2]
v2 <- Ys %*% B[, 2]
u3 <- Xs %*% A[, 3]
v3 <- Ys %*% B[, 3]
u4 <- Xs %*% A[, 4]
v4 <- Ys %*% B[, 4]
par(mfrow=c(2,2))
plot(u1,v1)
plot(u2,v2)
plot(u3,v3)
plot(u4,v4)
# Each |correlation| should now reproduce the corresponding canonical
# correlation in Xeig. (Previously recorded values 0.7806836, 0.9670324,
# 0.246139, -0.304574 came from the buggy coefficient-vector computation.)
cor(u1,v1)
cor(u2,v2)
cor(u3,v3)
cor(u4,v4)
## Bartlett's test for the significance of the first canonical correlation:
## The null hypothesis is that the first (and smaller) canonical correlations are zero.
# Take n from the data instead of hard-coding 691, so the statistic stays
# correct if the na.omit()/unique() cleaning above changes the row count.
my.n <- nrow(Data); my.q1 <- 4; my.q2 <- 4
# Bartlett's chi-square approximation: -((n-1) - (q1+q2+1)/2) * sum log(1 - rho_i^2)
test.stat <- -( (my.n-1) - 0.5*(my.q1+my.q2+1) ) * sum(log(1-eigen(E1)$values))
test.stat
## [1] 1049.725
# Upper-tail p-value on q1*q2 degrees of freedom (FALSE spelled out, not F).
P.value <- pchisq(test.stat, df = my.q1*my.q2, lower.tail=FALSE)
P.value
## [1] 2.505677e-213
## ** Since the P-value is tiny, we conclude that there is at least one nonzero
## canonical correlation. (This line was bare prose at the top level — an R
## syntax error — and is now a comment.)
cat(red("*** Another Code ***"))
## *** Another Code ***
# Alternative CCA implementations, intentionally left commented out:
# stats::cancor (base R), CCA::cc, and vegan::cca.
#install.packages("vegan")
#install.packages("CCA")
#library("vegan")
#library("CCA")
#correl <- matcor(X, Y )
#img.matcor(correl, type = 2)
#cc1 <- cancor(X, Y) ### function from standard R installation
#cc2 <- cc(X, Y) ### function from the R package 'CCA'
#cc1$cor ### function from standard R installation
#cc2$cor ### function from the R package 'CCA'
#par(mfrow = c(1,2))
#barplot(cc1$cor, main = "Canonical correlations for 'cancor()'", col = "gray")
#barplot(cc2$cor, main = "Canonical correlations for 'cc()'", col = "gray")
#cc1$xcoef ### function from standard R installation
#cc2$xcoef ### function from the R package 'CCA'
#cc1$ycoef
#cc2$ycoef
#plt.cc(cc2, var.label = TRUE, ind.names = Data[,11])
#library(vegan)
#cc3 <- cca(X, Y)
#plot(cc3, scaling = 1)